From 4054de8477b3ba4addb69cf9783dfb611f60faa6 Mon Sep 17 00:00:00 2001 From: Daniel Sabo Date: Sat, 30 Mar 2013 08:48:52 -0700 Subject: [PATCH] Add SSE2 conversions This patch includes two conversions for RaGaBaA -> RGBA. Depending on the CPU either spin or shuffle is significantly faster. Unless I can find a consistently fast version I'm going to let them fight it out in the babl startup benchmarks. --- configure.ac | 29 +++- extensions/Makefile.am | 18 ++- extensions/sse2-float.c | 299 ++++++++++++++++++++++++++++++++++++++++ extensions/sse2-int16.c | 186 +++++++++++++++++++++++++ 4 files changed, 525 insertions(+), 7 deletions(-) create mode 100644 extensions/sse2-float.c create mode 100644 extensions/sse2-int16.c diff --git a/configure.ac b/configure.ac index ce5a872..296ec27 100644 --- a/configure.ac +++ b/configure.ac @@ -294,9 +294,14 @@ AC_ARG_ENABLE(sse, [ --enable-sse enable SSE support (default=auto)],, enable_sse=$enable_mmx) +AC_ARG_ENABLE(sse2, + [ --enable-sse2 enable SSE2 support (default=auto)],, + enable_sse2=$enable_sse) + if test "x$enable_mmx" = xyes; then BABL_DETECT_CFLAGS(MMX_EXTRA_CFLAGS, '-mmmx') SSE_EXTRA_CFLAGS= + SSE2_EXTRA_CFLAGS= AC_MSG_CHECKING(whether we can compile MMX code) @@ -309,8 +314,11 @@ if test "x$enable_mmx" = xyes; then AC_MSG_RESULT(yes) if test "x$enable_sse" = xyes; then + BABL_DETECT_CFLAGS(fpmath_flag, '-mfpmath=sse') + SSE_EXTRA_CFLAGS="$MMX_EXTRA_CFLAGS $fpmath_flag" + BABL_DETECT_CFLAGS(sse_flag, '-msse') - SSE_EXTRA_CFLAGS="$MMX_EXTRA_CFLAGS $sse_flag" + SSE_EXTRA_CFLAGS="$SSE_EXTRA_CFLAGS $sse_flag" AC_MSG_CHECKING(whether we can compile SSE code) @@ -325,6 +333,24 @@ if test "x$enable_mmx" = xyes; then AC_MSG_WARN([The assembler does not support the SSE command set.]) ) + if test "x$enable_sse2" = xyes; then + BABL_DETECT_CFLAGS(sse2_flag, '-msse2') + SSE2_EXTRA_CFLAGS="$SSE_EXTRA_CFLAGS $sse2_flag" + + AC_MSG_CHECKING(whether we can compile SSE2 code) + + CFLAGS="$CFLAGS $sse2_flag" + + 
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("punpckhwd %xmm0,%xmm1");])], + AC_DEFINE(USE_SSE2, 1, [Define to 1 if SSE2 assembly is available.]) + AC_MSG_RESULT(yes) + , + enable_sse2=no + AC_MSG_RESULT(no) + AC_MSG_WARN([The assembler does not support the SSE2 command set.]) + ) + fi + fi , enable_mmx=no @@ -336,6 +362,7 @@ if test "x$enable_mmx" = xyes; then AC_SUBST(MMX_EXTRA_CFLAGS) AC_SUBST(SSE_EXTRA_CFLAGS) + AC_SUBST(SSE2_EXTRA_CFLAGS) fi diff --git a/extensions/Makefile.am b/extensions/Makefile.am index 2636f17..30ac8c5 100644 --- a/extensions/Makefile.am +++ b/extensions/Makefile.am @@ -21,16 +21,18 @@ ext_LTLIBRARIES = \ gggl-lies.la \ gggl.la \ gimp-8bit.la \ - float.la \ - fast-float.la \ + float.la \ + fast-float.la \ naive-CMYK.la \ - HSV.la \ + HSV.la \ simple.la \ - sse-fixups.la + sse-fixups.la \ + sse2-float.la \ + sse2-int16.la cairo_la_SOURCES = cairo.c CIE_la_SOURCES = CIE.c -expar_la_SOURCES = expar.c +simple_la_SOURCES = simple.c gegl_fixups_la_SOURCES = gegl-fixups.c gggl_lies_la_SOURCES = gggl-lies.c gggl_la_SOURCES = gggl.c @@ -38,9 +40,13 @@ gimp_8bit_la_SOURCES = gimp-8bit.c naive_CMYK_la_SOURCES = naive-CMYK.c HSV_la_SOURCES = HSV.c sse_fixups_la_SOURCES = sse-fixups.c +sse2_float_la_SOURCES = sse2-float.c +sse2_int16_la_SOURCES = sse2-int16.c float_la_SOURCES = float.c fast_float_la_SOURCES = fast-float.c LIBS = $(top_builddir)/babl/libbabl-@BABL_API_VERSION@.la $(MATH_LIB) -sse_fixups_la_CFLAGS = $(MMX_EXTRA_CFLAGS) $(SSE_EXTRA_CFLAGS) +sse_fixups_la_CFLAGS = $(SSE_EXTRA_CFLAGS) +sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS) +sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS) diff --git a/extensions/sse2-float.c b/extensions/sse2-float.c new file mode 100644 index 0000000..954e359 --- /dev/null +++ b/extensions/sse2-float.c @@ -0,0 +1,299 @@ +/* babl - dynamically extendable universal pixel conversion library. 
+ * Copyright (C) 2013 Massimo Valentini + * Copyright (C) 2013 Daniel Sabo + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General + * Public License along with this library; if not, see + * . + */ + +#include "config.h" + +#if defined(USE_SSE2) + +/* SSE 2 */ +#include + +#include +#include + +#include "babl.h" +#include "babl-cpuaccel.h" +#include "base/util.h" +#include "extensions/util.h" + +#define Q(a) { a, a, a, a } + +static const float BABL_ALPHA_THRESHOLD_FLOAT = (float)BABL_ALPHA_THRESHOLD; + +static long +conv_rgbaF_linear_rgbAF_linear (const float *src, float *dst, long samples) +{ + long i = 0; + long remainder; + + if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0) + { + const long n = (samples / 2) * 2; + const __v4sf *s = (const __v4sf*) src; + __v4sf *d = (__v4sf*)dst; + + for ( ; i < n; i += 2) + { + __v4sf rbaa0, rbaa1; + + __v4sf rgba0 = *s++; + __v4sf rgba1 = *s++; + + /* Expand alpha */ + __v4sf aaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(3, 3, 3, 3)); + __v4sf aaaa1 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba1, _MM_SHUFFLE(3, 3, 3, 3)); + + /* Premultiply */ + rgba0 = rgba0 * aaaa0; + rgba1 = rgba1 * aaaa1; + + /* Shuffle the original alpha value back in */ + rbaa0 = _mm_shuffle_ps(rgba0, aaaa0, _MM_SHUFFLE(0, 0, 2, 0)); + rbaa1 = _mm_shuffle_ps(rgba1, aaaa1, _MM_SHUFFLE(0, 0, 2, 0)); + + rgba0 = _mm_shuffle_ps(rgba0, rbaa0, _MM_SHUFFLE(2, 1, 1, 0)); + rgba1 = _mm_shuffle_ps(rgba1, rbaa1, 
_MM_SHUFFLE(2, 1, 1, 0)); + + *d++ = rgba0; + *d++ = rgba1; + } + _mm_empty (); + } + + dst += i * 4; + src += i * 4; + remainder = samples - i; + while (remainder--) + { + const float a = src[3]; + dst[0] = src[0] * a; + dst[1] = src[1] * a; + dst[2] = src[2] * a; + dst[3] = a; + + src += 4; + dst += 4; + } + + return samples; +} + +static long +conv_rgbAF_linear_rgbaF_linear_shuffle (const float *src, float *dst, long samples) +{ + long i = 0; + long remainder; + + if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0) + { + const long n = samples; + const __v4sf *s = (const __v4sf*) src; + __v4sf *d = (__v4sf*)dst; + + for ( ; i < n; i += 1) + { + __v4sf pre_rgba0, rgba0, rbaa0, raaaa0; + + float alpha0 = ((float *)s)[3]; + pre_rgba0 = *s; + + if (alpha0 <= 0.0f) + { + /* Zero RGB */ + rgba0 = _mm_setzero_ps(); + } + else + { + float recip0 = 1.0f/alpha0; + + /* Expand reciprocal */ + raaaa0 = _mm_load1_ps(&recip0); + + /* Un-Premultiply */ + rgba0 = pre_rgba0 * raaaa0; + } + + /* Shuffle the original alpha value back in */ + rbaa0 = _mm_shuffle_ps(rgba0, pre_rgba0, _MM_SHUFFLE(3, 3, 2, 0)); + rgba0 = _mm_shuffle_ps(rgba0, rbaa0, _MM_SHUFFLE(2, 1, 1, 0)); + + s++; + *d++ = rgba0; + } + _mm_empty (); + } + + dst += i * 4; + src += i * 4; + remainder = samples - i; + while (remainder--) + { + float alpha = src[3]; + float recip; + if (alpha <= 0.0f) + recip = 0.0f; + else + recip = 1.0f/alpha; + dst[0] = src[0] * recip; + dst[1] = src[1] * recip; + dst[2] = src[2] * recip; + dst[3] = alpha; + + src += 4; + dst += 4; + } + + return samples; +} + +static long +conv_rgbAF_linear_rgbaF_linear_spin (const float *src, float *dst, long samples) +{ + long i = 0; + long remainder; + + if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0) + { + const long n = samples; + const __v4sf *s = (const __v4sf*) src; + __v4sf *d = (__v4sf*)dst; + const __v4sf zero = _mm_setzero_ps(); + const __v4sf one = _mm_set_ss(1.0f); + + for ( ; i < n; i += 1) + { + __v4sf pre_abgr0, abgr0, 
rgba0, raaaa0; + + + rgba0 = *s; + /* Rotate to ABGR */ + pre_abgr0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(0, 1, 2, 3)); + + if (_mm_ucomile_ss(pre_abgr0, zero)) + { + /* Zero RGB */ + abgr0 = zero; + } + else + { + /* Un-Premultiply */ + raaaa0 = _mm_div_ss(one, pre_abgr0); + + /* Expand reciprocal */ + raaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)raaaa0, _MM_SHUFFLE(0, 0, 0, 0)); + + /* Un-Premultiply */ + abgr0 = pre_abgr0 * raaaa0; + } + + /* Move the original alpha value back in */ + abgr0 = _mm_move_ss(abgr0, pre_abgr0); + + /* Rotate back to RGBA */ + rgba0 = (__v4sf)_mm_shuffle_epi32((__m128i)abgr0, _MM_SHUFFLE(0, 1, 2, 3)); + + *d++ = rgba0; + s++; + } + _mm_empty (); + } + + dst += i * 4; + src += i * 4; + remainder = samples - i; + while (remainder--) + { + float alpha = src[3]; + float recip; + if (alpha <= 0.0f) + recip = 0.0f; + else + recip = 1.0f/alpha; + dst[0] = src[0] * recip; + dst[1] = src[1] * recip; + dst[2] = src[2] * recip; + dst[3] = alpha; + + src += 4; + dst += 4; + } + + return samples; +} + +#endif /* defined(USE_SSE2) */ + +#define o(src, dst) \ + babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL) + +int init (void); + +int +init (void) +{ +#if defined(USE_SSE2) + + const Babl *rgbaF_linear = babl_format_new ( + babl_model ("RGBA"), + babl_type ("float"), + babl_component ("R"), + babl_component ("G"), + babl_component ("B"), + babl_component ("A"), + NULL); + const Babl *rgbAF_linear = babl_format_new ( + babl_model ("RaGaBaA"), + babl_type ("float"), + babl_component ("Ra"), + babl_component ("Ga"), + babl_component ("Ba"), + babl_component ("A"), + NULL); + + if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE) && + (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2)) + + { + babl_conversion_new(rgbaF_linear, + rgbAF_linear, + "linear", + conv_rgbaF_linear_rgbAF_linear, + NULL); + + /* Which of these is faster varies by CPU, and the difference + * is big enough that it's worthwhile to 
include both and + * let them fight it out in the babl benchmarks. + */ + babl_conversion_new(rgbAF_linear, + rgbaF_linear, + "linear", + conv_rgbAF_linear_rgbaF_linear_shuffle, + NULL); + babl_conversion_new(rgbAF_linear, + rgbaF_linear, + "linear", + conv_rgbAF_linear_rgbaF_linear_spin, + NULL); + } + +#endif /* defined(USE_SSE2) */ + + return 0; +} + diff --git a/extensions/sse2-int16.c b/extensions/sse2-int16.c new file mode 100644 index 0000000..252d1a7 --- /dev/null +++ b/extensions/sse2-int16.c @@ -0,0 +1,186 @@ +/* babl - dynamically extendable universal pixel conversion library. + * Copyright (C) 2013 Massimo Valentini + * Copyright (C) 2013 Daniel Sabo + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General + * Public License along with this library; if not, see + * . 
+ */ + +#include "config.h" + +#if defined(USE_SSE2) + +/* SSE 2 */ +#include + +#include +#include + +#include "babl.h" +#include "babl-cpuaccel.h" +#include "extensions/util.h" + +#define Q(a) { a, a, a, a } +static const __v4sf u16_float = Q (1.f / 65535); + +static long +conv_rgba16_linear_rgbaF_linear (const uint16_t *src, float *dst, long samples) +{ + long i = 0; + + if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0) + { + long n = (samples / 2) * 2; + const __m128i *s = (const __m128i*) src; + __v4sf *d = (__v4sf*) dst; + + for (; i < n / 2; i++) + { + /* Expand shorts to ints by loading zero in the high bits */ + const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps()); + const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps()); + + /* Convert to float */ + const __m128 u0 = _mm_cvtepi32_ps (t0); + const __m128 u1 = _mm_cvtepi32_ps (t1); + + const __v4sf rgba0 = u0 * u16_float; + const __v4sf rgba1 = u1 * u16_float; + + d[2 * i + 0] = rgba0; + d[2 * i + 1] = rgba1; + } + _mm_empty(); + } + + for (i *= 2 * 4; i != 4 * samples; i++) + dst[i] = src[i] * (1.f / 65535); + + return samples; +} + +static long +conv_rgba16_linear_rgbAF_linear (const uint16_t *src, float *dst, long samples) +{ + long i = 0; + long remainder; + + if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0) + { + long n = (samples / 2) * 2; + const __m128i *s = (const __m128i*) src; + __v4sf *d = (__v4sf*) dst; + + const __v4sf max_mask = { 0.0f, 0.0f, 0.0f, 1.0f }; + + for (; i < n / 2; i++) + { + /* Expand shorts to ints by loading zero in the high bits */ + const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps()); + const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps()); + + /* Convert to float */ + const __m128 u0 = _mm_cvtepi32_ps (t0); + const __m128 u1 = _mm_cvtepi32_ps (t1); + + /* Multiply by 1 / 65535 */ + __v4sf rgba0 = u0 * u16_float; + __v4sf rgba1 = u1 * u16_float; + + /* Expand alpha */ + 
__v4sf aaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(3, 3, 3, 3)); + __v4sf aaaa1 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba1, _MM_SHUFFLE(3, 3, 3, 3)); + + /* Set the value in the alpha slot to 1.0, we know max is sufficient because alpha was a short */ + aaaa0 = _mm_max_ps(aaaa0, max_mask); + aaaa1 = _mm_max_ps(aaaa1, max_mask); + + /* Premultiply */ + rgba0 = rgba0 * aaaa0; + rgba1 = rgba1 * aaaa1; + + d[2 * i + 0] = rgba0; + d[2 * i + 1] = rgba1; + } + _mm_empty(); + } + + dst += i * 2 * 4; + src += i * 2 * 4; + remainder = samples - (i * 2); + while (remainder--) + { + const float a = src[3] / 65535.0f; + const float a_term = a / 65535.0f; + dst[0] = src[0] * a_term; + dst[1] = src[1] * a_term; + dst[2] = src[2] * a_term; + dst[3] = a; + + src += 4; + dst += 4; + } + + return samples; +} + +#endif /* defined(USE_SSE2) */ + +#define o(src, dst) \ + babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL) + +int init (void); + +int +init (void) +{ +#if defined(USE_SSE2) + + const Babl *rgbaF_linear = babl_format_new ( + babl_model ("RGBA"), + babl_type ("float"), + babl_component ("R"), + babl_component ("G"), + babl_component ("B"), + babl_component ("A"), + NULL); + const Babl *rgbAF_linear = babl_format_new ( + babl_model ("RaGaBaA"), + babl_type ("float"), + babl_component ("Ra"), + babl_component ("Ga"), + babl_component ("Ba"), + babl_component ("A"), + NULL); + const Babl *rgba16_linear = babl_format_new ( + babl_model ("RGBA"), + babl_type ("u16"), + babl_component ("R"), + babl_component ("G"), + babl_component ("B"), + babl_component ("A"), + NULL); + + if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE) && + (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2)) + { + o (rgba16_linear, rgbaF_linear); + o (rgba16_linear, rgbAF_linear); + } + +#endif /* defined(USE_SSE2) */ + + return 0; +} + -- 2.30.2